1 Предварительный анализ данных

if (interactive() && !str_ends(getwd(), "R/stat_anal/CITY_US")) {
    setwd("R/stat_anal/CITY_US")
}

data <- read_excel("CITY_shortname.xls")
data[data == "NA"] <- NA
data[, -(1:2)] <- data.frame(lapply(data[, -(1:2)], as.numeric))
fullnames <- names(read_excel("CITY.xls"))

1.1 Разобраться в том, что означают признаки.

print(fullnames)
##  [1] "CITY City"                                                                            
##  [2] "STATE State"                                                                          
##  [3] "AREA Land area, 1990 (sq.miles)"                                                      
##  [4] "R_AREA Land area, 1990 (sq.miles), ranks"                                             
##  [5] "POP92 Population,1992"                                                                
##  [6] "R_POP92 Population,1992, ranks"                                                       
##  [7] "POP80 Population,1980"                                                                
##  [8] "R_POP80 Population,1980, ranks"                                                       
##  [9] "POP_CH Population growth rate,1980-1992"                                              
## [10] "R_POP_CH Population growth rate,1980-1992, ranks"                                     
## [11] "POPDEN Population per square mile, 1992"                                              
## [12] "R_POPDEN Population per square mile, 1992, ranks"                                     
## [13] "OLD % older 65 years"                                                                 
## [14] "R_OLD % older 65 years, ranks"                                                        
## [15] "BLACK Black population,1990"                                                          
## [16] "R_BLACK Black population,1990, ranks"                                                 
## [17] "BLACK% % Black population,1990"                                                       
## [18] "R_BLACK% % Black population,1990, ranks"                                              
## [19] "ASIAN Asian or Pacific Islander population,1990"                                      
## [20] "R_ASIAN Asian or Pacific Islander population,1990, ranks"                             
## [21] "ASIAN% % Asian or Pacific Islander population,1990"                                   
## [22] "R_ASIAN% % Asian or Pacific Islander population,1990, ranks"                          
## [23] "HISP Hispanic population, 1990"                                                       
## [24] "R_HISP Hispanic population, 1990, ranks"                                              
## [25] "HISP% % Hispanic population, 1990"                                                    
## [26] "R_HISP% % Hispanic population, 1990, ranks"                                           
## [27] "BORN_F % persons of foreign born"                                                     
## [28] "R_BORN_F % persons of foreign born, ranks"                                            
## [29] "LANG % of persons, speaking language other than English at home, 1990"                
## [30] "R_LANG % of persons, speaking language other than English at home, 1990, ranks"       
## [31] "HH1 % 1-person households, 1990"                                                      
## [32] "R_HH1 % 1-person households, 1990, ranks"                                             
## [33] "FAMIL1 % one-parent family households, 1990"                                          
## [34] "R_FAMIL1 % one-parent family households, 1990, ranks"                                 
## [35] "DEATH Infant death rate per 1000 live births,1988"                                    
## [36] "R_DEATH Infant death rate per 1000 live births,1988, ranks"                           
## [37] "CRIME Crime rate per 100000 population, 1991"                                         
## [38] "R_CRIME Crime rate per 100000 population, 1991, rate"                                 
## [39] "SCHOOL % of elementary and high school enrollment in public schools, 1990"            
## [40] "R_SCHOOL % of elementary and high school enrollment in public schools, 1990, ranks"   
## [41] "DEGREE % of adults with a bachelor's degree or higher, 1990"                          
## [42] "R_DEGREE % of adults with a bachelor's degree or higher, 1990, ranks"                 
## [43] "INCOME Median household income, 1989"                                                 
## [44] "R_INCOME Median household income, 1989, ranks"                                        
## [45] "ASSIST % of households, receiving public assistance,1989"                             
## [46] "R_ASSIST % of households, receiving public assistance,1989, ranks"                    
## [47] "POVERT % of persons below poverty level, 1989"                                        
## [48] "R_POVERT % of persons below poverty level, 1989, ranks"                               
## [49] "OLB_BIL % of housing units built 1939 or earlier, 1990"                               
## [50] "R_OLD_BI % of housing units built 1939 or earlier, 1990, ranks"                       
## [51] "OWNER Median value of specified owner-occupied housing units ($), 1990"               
## [52] "R_OWNER Median value of specified owner-occupied housing units ($), 1990, ranks"      
## [53] "RENTER % renter-occupied housing units,1990"                                          
## [54] "R_RENTER % renter-occupied housing units,1990, ranks"                                 
## [55] "GROSS Median gross rent of specified renter-occupied housing units ($), 1990"         
## [56] "R_GROSS Median gross rent of specified renter-occupied housing units ($), 1990, ranks"
## [57] "CONDOM % condominimum occupied housing units, 1990"                                   
## [58] "R_CONDOM % condominimum occupied housing units, 1990, ranks"                          
## [59] "TRANSP % of workers using public transportation"                                      
## [60] "R_TRANSP % of workers using public transportation, ranks"                             
## [61] "UNEMP Unemployment rate, 1991"                                                        
## [62] "R_UNEMP Unemployment rate, 1991, ranks"                                               
## [63] "LABOR Labor force - % change 1980-1990"                                               
## [64] "R_LABOR Labor force - % change 1980-1990, ranks"                                      
## [65] "LAB_F Female civilian labor force participation rate, 1990"                           
## [66] "R_LAB_F Female civilian labor force participation rate, 1990, ranks"                  
## [67] "MANLAB % of civilian labor force emploied in manufacturing, 1990"                     
## [68] "R_MANLAB % of civilian labor force emploied in manufacturing, 1990, ranks"            
## [69] "TAXE City government taxes per capita, 1991"                                          
## [70] "R_TAXE City government taxes per capita, 1991, ranks"                                 
## [71] "TEMPER Average daily temperature in July"                                             
## [72] "R_TEMPER Average daily temperature in July, ranks"                                    
## [73] "PRECEP Annual precipitation"                                                          
## [74] "R_PRECEP Annual precipitation"

1.2 Отобрать признаки

names_interesting <- c("AREA", "POP80", "POP92", "POPDEN", "CRIME", "BORN_F", "POVERT", "INCOME", "UNEMP", "TEMPER")

data <- data %>% select(all_of(c("CITY", "STATE", names_interesting)))

print(head(data))
## # A tibble: 6 × 12
##   CITY  STATE  AREA  POP80  POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
##   <chr> <chr> <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1 NEW_… NY     309. 7.07e6 7.31e6  23671  9236   28.4   19.3  29823   8.6   76.8
## 2 LOS_… CA     469. 2.97e6 3.49e6   7436  9730   38.4   18.9  30925   9     74.3
## 3 CHIC… IL     227. 3.01e6 2.77e6  12185    NA   16.9   21.6  26301   8.4   75.1
## 4 HOUS… TX     540. 1.60e6 1.69e6   3131 10824   17.8   20.7  26261   6.1   83.5
## 5 PHIL… PA     135. 1.69e6 1.55e6  11492  6835    6.6   20.3  24603   8     76.7
## 6 SAN_… CA     324  8.76e5 1.15e6   3546  8537   20.9   13.4     10   6.2   71

1.3 Определить вид признаков

Город и штат качественные, остальные количественные, ранги были порядковыми.

find_mode_freq <- function(x) {
    x <- x[!is.na(x)]
    return(max(tabulate(match(x, x))))
}

print(data %>% summarise(across(
    all_of(names_interesting),
    find_mode_freq
)))
## # A tibble: 1 × 10
##    AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
##   <int> <int> <int>  <int> <int>  <int>  <int>  <int> <int>  <int>
## 1     1     1     1      2     1      4      2      1     5      3
print(sort(data$UNEMP))
##  [1]  2.3  3.2  3.8  3.9  4.1  4.4  4.6  4.7  4.7  4.8  4.8  5.0  5.0  5.0  5.0
## [16]  5.1  5.1  5.1  5.3  5.4  5.4  5.4  5.4  5.4  5.6  5.7  5.9  5.9  5.9  6.0
## [31]  6.0  6.1  6.1  6.1  6.1  6.2  6.2  6.4  6.4  6.5  6.7  6.7  6.7  6.8  6.9
## [46]  6.9  7.0  7.1  7.2  7.2  7.3  7.3  7.3  7.5  7.6  7.7  7.7  7.7  8.0  8.1
## [61]  8.4  8.4  8.5  8.6  9.0  9.0  9.4  9.5  9.6 10.0 10.4 10.6 10.7 11.0 11.9
## [76] 12.6 13.1

Все количественные буду считать непрерывными. возможно UNEMP непрерывный с плохой точностью.

1.4 не актуально

1.5 Построить matrix plot

if (interactive()) pdf("ggpairs_unedited.pdf")
ggpairs(
    data[, -(1:2)],
    lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
    diag = list(continuous = "barDiag")
)

if (interactive()) dev.off()

1.7 outliers

Убираю outliers:

Помечаю некорректные данные в INCOME как NA. Удаляю город из Аляски за плотность населения. Флорида выделсется на BORN_F-INCOME Гаваи выделяются низким уровнем безработицы. странно?

data$INCOME[data$INCOME < 100] <- NA
data <- data %>% filter(STATE != "AK")

1.6 Несимметричные распределения

Функция, которая логарифмирует, если это сделает выборку симметричнее

log_asymmetric <- function(x) {
    if (skewness(x, na.rm = TRUE) < abs(skewness(log(x), na.rm = TRUE))) {
        print("default")
        return(x)
    } else {
        print("logged")
        return(log(x))
    }
}

Автоматически логарифмирую то что имеет асимметрию и длинный хвост справа

data_logged <- data %>%
    mutate(across(all_of(names_interesting), log_asymmetric))
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "default"
## [1] "default"
## [1] "logged"
## [1] "default"
# if (interactive()) pdf("ggpairs_logged.pdf")
# ggpairs(
#     data_logged[, -(1:2)],
#     lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
#     diag = list(continuous = "barDiag")
# )
# if (interactive()) dev.off()

1.8 однородность

выглядит однородно.

.9 не актуально

1.10 всякие характеристики

print_characteristics <- function(x) {
    list(
        mean = mean(x, na.rm = TRUE),
        var = var(x, na.rm = TRUE),
        skewness = skewness(x, na.rm = TRUE),
        kurtosis = kurtosis(x, na.rm = TRUE) - 3
    )
}

sapply(data_logged %>% select(all_of
(names_interesting)), print_characteristics)
##          AREA       POP80     POP92     POPDEN     CRIME      BORN_F    
## mean     4.754638   12.9069   13.01226  8.257586   9.206558   1.959747  
## var      0.6658984  0.5142076 0.4464288 0.5099096  0.06798296 0.8435754 
## skewness 0.1317646  1.453879  1.743773  0.1015321  0.1476689  0.3081197 
## kurtosis -0.3907363 3.033026  3.885752  -0.1531417 0.08872069 -0.7431516
##          POVERT     INCOME     UNEMP      TEMPER    
## mean     18.11842   25282.48   1.873674   77.60132  
## var      34.68872   14321837   0.09967048 37.59853  
## skewness 0.2243883  -0.2196584 -0.2186434 -0.1426746
## kurtosis -0.3539569 -0.6467894 0.6594639  0.7472063

регрессия

df <- data_logged %>%
    select(-CITY, -STATE) %>%
    drop_na()
df_names <- data_logged %>%
    drop_na()

ggplot(melt(cor(df))) +
    geom_raster(aes(x = Var2, y = Var1, fill = value)) +
    geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
    scale_fill_gradient2() +
    theme_dark() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

2 Задаете зависимые и ‘независимые’ переменные (регрессоры), не забываете обратить внимание на выбор способа обработки пропущенных наблюдений, функция lm.

model_ <- lm(CRIME ~ ., data = df, na.action = na.exclude)
model <- lm.beta(model_)
summary(model)
## 
## Call:
## lm(formula = CRIME ~ ., data = df, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41586 -0.11001 -0.00454  0.11585  0.46619 
## 
## Coefficients:
##               Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  9.347e+00           NA  8.941e-01  10.453 1.11e-14 ***
## AREA        -6.914e+01   -2.356e+02  1.949e+02  -0.355   0.7241    
## POP80        4.567e-01    1.292e+00  2.379e-01   1.920   0.0601 .  
## POP92        6.863e+01    1.850e+02  1.949e+02   0.352   0.7261    
## POPDEN      -6.926e+01   -2.031e+02  1.948e+02  -0.355   0.7236    
## BORN_F       1.232e-01    4.412e-01  4.791e-02   2.572   0.0128 *  
## POVERT       1.586e-02    3.506e-01  1.231e-02   1.288   0.2030    
## INCOME       6.049e-06    9.536e-02  1.659e-05   0.365   0.7167    
## UNEMP        1.267e-01    1.575e-01  1.263e-01   1.003   0.3204    
## TEMPER       8.670e-03    2.107e-01  5.543e-03   1.564   0.1235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2107 on 55 degrees of freedom
## Multiple R-squared:  0.3476, Adjusted R-squared:  0.2409 
## F-statistic: 3.256 on 9 and 55 DF,  p-value: 0.003072
ggplot(melt(cov2cor(vcov(model)))) +
    geom_raster(aes(x = Var2, y = Var1, fill = value)) +
    geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
    scale_fill_gradient2() +
    theme_dark() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# cov2cor(vcov(model))

3 Интерпретируете результаты регрессии, включая результат lm.beta (в частности, о разнице между b и beta, о значимости и пр.).

hist(residuals(model))

shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.98567, p-value = 0.656
plot(fitted(model), residuals(model))

abline(h = 0, lty = 2)

4 Далее есть три проблемы, из-за которых результаты регрессии могут быть неправильными

линейная модель регрессии не соответствует данным в данных

могут быть сильно зависимые «независимые» переменные

есть население в 80 и 92 и другие зависимые

также могут быть outliers.

outliers уже убраны, если какие-то и остались, это не так критично

5 Как строятся доверительные интервалы и двумерные доверительные области.

На примере пары признаков строите двумерный доверительный интервал для пары значащих коэффициентов регрессии, интерпретируете: (1) оба признака влияют на результат согласно оценкам коэффициентов регрессии перед ними: или (2) признаки вместе сильно влияют, но не различить, какой из них больше; или (3) непонятно, или оба признака слабо влияют, или оба влияют сильно.

plot_ellipse <- function(model_, id) {
    # confidenceEllipse(lm.beta(model_), which.coef = id, vcov. = function(x) cov2cor(vcov(x)))
    # points(0, 0, col = "red", pch = 19)
    # lines(x = c(0, coef(lm.beta(model_))[id[1]]), c(0, coef(lm.beta(model_))[id[2]]), col = "red")
    confidenceEllipse(model_, which.coef = id)
    points(0, 0, col = "red", pch = 19)
    lines(x = c(0, coef(model_)[id[1]]), c(0, coef(model_)[id[2]]), col = "red")

    # if (id[1] != 1 && id[2] != 1) {
    #     confidenceEllipse(lm.beta(model_), which.coef = id, vcov. = function(x) cov2cor(vcov(x)))
    #     points(0, 0, col = "red", pch = 19)
    #     lines(x = c(0, coef(lm.beta(model_))[id[1]]), c(0, coef(lm.beta(model_))[id[2]]), col = "red")
    # }
}
plot_ellipse(model_, c(3, 6))

plot_ellipse(model_, c(3, 7))

# plot_ellipse(model, c(3, 8))
# plot_ellipse(model, c(3, 9))
# plot_ellipse(model, c(3, 10))
# plot_ellipse(model_, c(4, 5))
# plot_ellipse(model_, c(6, 7))

6 На примере с двумя «независимыми» признаками пишете формулы и показываете, как корреляция между признаками влияет на качество оценок регрессии.

7 Нужно избавляться от лишних, избыточных, признаков.

Сделаем это вручную на основе таблицы Redundancy. Там «независимые переменные» сравниваются по двум критериям - независимость от других «независимых» признаков и зависимость от зависимой переменной. Объясняете, что означают столбцы, что делать, если эти критерии дают противоречивые рекомендации, решаете, какой признак лучше убрать первым.

# ols_vif_tol(model_)
# ols_correlations(model_)

8 Убираете вручную на основе Redundancy несколько признаков и смотрите, что меняется (R, adjusted R, значимость регрессии, значимость коэффициентов регрессии, независимость «независимых» переменных, AIC).

model_manual <- lm(CRIME ~ ., data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ ., data = df, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41586 -0.11001 -0.00454  0.11585  0.46619 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.347e+00  8.941e-01  10.453 1.11e-14 ***
## AREA        -6.914e+01  1.949e+02  -0.355   0.7241    
## POP80        4.567e-01  2.379e-01   1.920   0.0601 .  
## POP92        6.863e+01  1.949e+02   0.352   0.7261    
## POPDEN      -6.926e+01  1.948e+02  -0.355   0.7236    
## BORN_F       1.232e-01  4.791e-02   2.572   0.0128 *  
## POVERT       1.586e-02  1.231e-02   1.288   0.2030    
## INCOME       6.049e-06  1.659e-05   0.365   0.7167    
## UNEMP        1.267e-01  1.263e-01   1.003   0.3204    
## TEMPER       8.670e-03  5.543e-03   1.564   0.1235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2107 on 55 degrees of freedom
## Multiple R-squared:  0.3476, Adjusted R-squared:  0.2409 
## F-statistic: 3.256 on 9 and 55 DF,  p-value: 0.003072
ols_vif_tol(model_manual)
##   Variables    Tolerance          VIF
## 1      AREA 2.689858e-08 3.717668e+07
## 2     POP80 2.618028e-02 3.819668e+01
## 3     POP92 4.298257e-08 2.326524e+07
## 4    POPDEN 3.633518e-08 2.752154e+07
## 5    BORN_F 4.032272e-01 2.479992e+00
## 6    POVERT 1.601332e-01 6.244802e+00
## 7    INCOME 1.735122e-01 5.763285e+00
## 8     UNEMP 4.810074e-01 2.078970e+00
## 9    TEMPER 6.540152e-01 1.529016e+00
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## AREA            -0.032     -0.048    -0.039 
## POP80            0.028      0.251     0.209 
## POP92           -0.015      0.047     0.038 
## POPDEN           0.023     -0.048    -0.039 
## BORN_F           0.200      0.328     0.280 
## POVERT           0.411      0.171     0.140 
## INCOME          -0.299      0.049     0.040 
## UNEMP            0.369      0.134     0.109 
## TEMPER           0.162      0.206     0.170 
## -------------------------------------------
AIC(model_manual)
## [1] -6.855636

Удаляю AREA из-за VIF и partial

model_manual <- lm(CRIME ~ POP80 + POP92 + POPDEN + INCOME + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POP80 + POP92 + POPDEN + INCOME + BORN_F + 
##     POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4236 -0.1177 -0.0064  0.1200  0.4580 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.434e+00  8.525e-01  11.067 1.02e-15 ***
## POP80        4.704e-01  2.329e-01   2.020   0.0482 *  
## POP92       -5.210e-01  2.494e-01  -2.089   0.0413 *  
## POPDEN      -1.319e-01  5.602e-02  -2.354   0.0221 *  
## INCOME       6.354e-06  1.644e-05   0.387   0.7005    
## BORN_F       1.250e-01  4.729e-02   2.643   0.0106 *  
## POVERT       1.573e-02  1.221e-02   1.288   0.2029    
## UNEMP        1.327e-01  1.242e-01   1.068   0.2899    
## TEMPER       8.346e-03  5.424e-03   1.539   0.1295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.209 on 56 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.2527 
## F-statistic: 3.705 on 8 and 56 DF,  p-value: 0.001553
ols_vif_tol(model_manual)
##   Variables  Tolerance       VIF
## 1     POP80 0.02688552 37.194742
## 2     POP92 0.02584005 38.699621
## 3    POPDEN 0.43265704  2.311299
## 4    INCOME 0.17397908  5.747817
## 5    BORN_F 0.40744566  2.454315
## 6    POVERT 0.16027769  6.239172
## 7     UNEMP 0.48984345  2.041469
## 8    TEMPER 0.67225320  1.487535
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POP80            0.028      0.261     0.218 
## POP92           -0.015     -0.269    -0.226 
## POPDEN           0.023     -0.300    -0.254 
## INCOME          -0.299      0.052     0.042 
## BORN_F           0.200      0.333     0.286 
## POVERT           0.411      0.170     0.139 
## UNEMP            0.369      0.141     0.115 
## TEMPER           0.162      0.201     0.166 
## -------------------------------------------
AIC(model_manual)
## [1] -8.707016

Удаляю POP80 из-за VIF

model_manual <- lm(CRIME ~ POP92 + POPDEN + INCOME + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POP92 + POPDEN + INCOME + BORN_F + POVERT + 
##     UNEMP + TEMPER, data = df, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45095 -0.12197 -0.02222  0.12243  0.48281 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.325e+00  8.735e-01  10.677 3.23e-15 ***
## POP92       -2.578e-02  4.679e-02  -0.551   0.5837    
## POPDEN      -9.384e-02  5.416e-02  -1.733   0.0886 .  
## INCOME      -2.185e-06  1.631e-05  -0.134   0.8938    
## BORN_F       7.990e-02  4.280e-02   1.867   0.0671 .  
## POVERT       1.682e-02  1.252e-02   1.343   0.1845    
## UNEMP        1.441e-01  1.274e-01   1.132   0.2624    
## TEMPER       4.401e-03  5.195e-03   0.847   0.4005    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2146 on 57 degrees of freedom
## Multiple R-squared:  0.2985, Adjusted R-squared:  0.2124 
## F-statistic: 3.465 on 7 and 57 DF,  p-value: 0.003686
ols_vif_tol(model_manual)
##   Variables Tolerance      VIF
## 1     POP92 0.7738140 1.292300
## 2    POPDEN 0.4878151 2.049957
## 3    INCOME 0.1863123 5.367333
## 4    BORN_F 0.5242427 1.907513
## 5    POVERT 0.1605911 6.226996
## 6     UNEMP 0.4908708 2.037196
## 7    TEMPER 0.7724648 1.294557
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POP92           -0.015     -0.073    -0.061 
## POPDEN           0.023     -0.224    -0.192 
## INCOME          -0.299     -0.018    -0.015 
## BORN_F           0.200      0.240     0.207 
## POVERT           0.411      0.175     0.149 
## UNEMP            0.369      0.148     0.126 
## TEMPER           0.162      0.112     0.094 
## -------------------------------------------
AIC(model_manual)
## [1] -6.137272

Удаляю INCOME из-за VIF и Partial

model_manual <- lm(CRIME ~ POP92 + POPDEN + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POP92 + POPDEN + BORN_F + POVERT + UNEMP + 
##     TEMPER, data = df, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4482 -0.1272 -0.0226  0.1197  0.4762 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.278364   0.793016  11.700  < 2e-16 ***
## POP92       -0.027764   0.044014  -0.631  0.53065    
## POPDEN      -0.093971   0.053691  -1.750  0.08537 .  
## BORN_F       0.077684   0.039156   1.984  0.05200 .  
## POVERT       0.018241   0.006621   2.755  0.00783 ** 
## UNEMP        0.141187   0.124362   1.135  0.26092    
## TEMPER       0.004414   0.005150   0.857  0.39493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2128 on 58 degrees of freedom
## Multiple R-squared:  0.2983, Adjusted R-squared:  0.2257 
## F-statistic: 4.109 on 6 and 58 DF,  p-value: 0.001672
ols_vif_tol(model_manual)
##   Variables Tolerance      VIF
## 1     POP92 0.8595873 1.163349
## 2    POPDEN 0.4879834 2.049250
## 3    BORN_F 0.6158375 1.623805
## 4    POVERT 0.5648207 1.770473
## 5     UNEMP 0.5060650 1.976031
## 6    TEMPER 0.7727314 1.294111
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POP92           -0.015     -0.083    -0.069 
## POPDEN           0.023     -0.224    -0.193 
## BORN_F           0.200      0.252     0.218 
## POVERT           0.411      0.340     0.303 
## UNEMP            0.369      0.147     0.125 
## TEMPER           0.162      0.112     0.094 
## -------------------------------------------
AIC(model_manual)
## [1] -8.116788

Удаляю POP92 из-за Partial

model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT + UNEMP + TEMPER, 
##     data = df, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44159 -0.12102 -0.04726  0.12545  0.47496 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.001920   0.657539  13.690  < 2e-16 ***
## POPDEN      -0.100061   0.052545  -1.904  0.06175 .  
## BORN_F       0.072774   0.038178   1.906  0.06150 .  
## POVERT       0.018301   0.006586   2.779  0.00731 ** 
## UNEMP        0.144104   0.123640   1.166  0.24850    
## TEMPER       0.004003   0.005082   0.788  0.43410    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2117 on 59 degrees of freedom
## Multiple R-squared:  0.2935, Adjusted R-squared:  0.2336 
## F-statistic: 4.901 on 5 and 59 DF,  p-value: 0.0008112
ols_vif_tol(model_manual)
##   Variables Tolerance      VIF
## 1    POPDEN 0.5042890 1.982990
## 2    BORN_F 0.6411805 1.559623
## 3    POVERT 0.5649394 1.770101
## 4     UNEMP 0.5067659 1.973298
## 5    TEMPER 0.7853115 1.273380
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POPDEN           0.023     -0.241    -0.208 
## BORN_F           0.200      0.241     0.209 
## POVERT           0.411      0.340     0.304 
## UNEMP            0.369      0.150     0.128 
## TEMPER           0.162      0.102     0.086 
## -------------------------------------------
AIC(model_manual)
## [1] -9.672381

Удаляю TEMPER из-за Partial

model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT + UNEMP, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT + UNEMP, data = df, 
##     na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43465 -0.11715 -0.04416  0.13301  0.47304 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.446971   0.335126  28.189  < 2e-16 ***
## POPDEN      -0.118801   0.046700  -2.544  0.01356 *  
## BORN_F       0.079726   0.037025   2.153  0.03533 *  
## POVERT       0.018828   0.006532   2.883  0.00547 ** 
## UNEMP        0.143795   0.123247   1.167  0.24794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.211 on 60 degrees of freedom
## Multiple R-squared:  0.286,  Adjusted R-squared:  0.2384 
## F-statistic:  6.01 on 4 and 60 DF,  p-value: 0.0003905
ols_vif_tol(model_manual)
##   Variables Tolerance      VIF
## 1    POPDEN 0.6343884 1.576321
## 2    BORN_F 0.6773993 1.476234
## 3    POVERT 0.5708139 1.751885
## 4     UNEMP 0.5067710 1.973278
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POPDEN           0.023     -0.312    -0.278 
## BORN_F           0.200      0.268     0.235 
## POVERT           0.411      0.349     0.314 
## UNEMP            0.369      0.149     0.127 
## -------------------------------------------
AIC(model_manual)
## [1] -10.99261

Удаляю UNEMP из-за Partial

model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
summary(model_manual)
## 
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4113 -0.1241 -0.0199  0.1491  0.4475 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.536704   0.327146  29.151  < 2e-16 ***
## POPDEN      -0.109661   0.046174  -2.375   0.0207 *  
## BORN_F       0.093070   0.035319   2.635   0.0106 *  
## POVERT       0.023184   0.005375   4.313 5.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2116 on 61 degrees of freedom
## Multiple R-squared:  0.2698, Adjusted R-squared:  0.2339 
## F-statistic: 7.514 on 3 and 61 DF,  p-value: 0.000233
ols_vif_tol(model_manual)
##   Variables Tolerance      VIF
## 1    POPDEN 0.6527595 1.531958
## 2    BORN_F 0.7488569 1.335369
## 3    POVERT 0.8479393 1.179330
ols_correlations(model_manual)
##                Correlations                 
## -------------------------------------------
## Variable    Zero Order    Partial     Part  
## -------------------------------------------
## POPDEN           0.023     -0.291    -0.260 
## BORN_F           0.200      0.320     0.288 
## POVERT           0.411      0.483     0.472 
## -------------------------------------------
AIC(model_manual)
## [1] -11.53442

все

9 Далее переходите к автоматической пошаговой регрессии по AIC. По результатам определяете, сколько признаков оставить. Сравниваете результаты Forward и Backward вариантов.

model.start <- lm(CRIME ~ 1, data = df, na.action = na.exclude)
model.end <- model_

stepAIC(model.start, direction = "forward", scope = list(lower = model.start, upper = model.end))
## Start:  AIC=-183.55
## CRIME ~ 1
## 
##          Df Sum of Sq    RSS     AIC
## + POVERT  1   0.63194 3.1104 -193.58
## + UNEMP   1   0.50967 3.2326 -191.07
## + INCOME  1   0.33444 3.4079 -187.64
## + BORN_F  1   0.15043 3.5919 -184.22
## <none>                3.7423 -183.55
## + TEMPER  1   0.09834 3.6440 -183.28
## + AREA    1   0.00390 3.7384 -181.62
## + POP80   1   0.00296 3.7394 -181.61
## + POPDEN  1   0.00206 3.7403 -181.59
## + POP92   1   0.00087 3.7414 -181.57
## 
## Step:  AIC=-193.58
## CRIME ~ POVERT
## 
##          Df Sum of Sq    RSS     AIC
## + TEMPER  1  0.140358 2.9700 -194.58
## + BORN_F  1  0.125238 2.9851 -194.25
## <none>                3.1104 -193.58
## + UNEMP   1  0.085932 3.0244 -193.40
## + POPDEN  1  0.066834 3.0435 -192.99
## + AREA    1  0.027661 3.0827 -192.16
## + INCOME  1  0.019274 3.0911 -191.98
## + POP80   1  0.007771 3.1026 -191.74
## + POP92   1  0.003654 3.1067 -191.65
## 
## Step:  AIC=-194.58
## CRIME ~ POVERT + TEMPER
## 
##          Df Sum of Sq    RSS     AIC
## + BORN_F  1  0.127679 2.8423 -195.43
## + UNEMP   1  0.104053 2.8660 -194.90
## <none>                2.9700 -194.58
## + INCOME  1  0.022169 2.9478 -193.06
## + POPDEN  1  0.013926 2.9561 -192.88
## + POP92   1  0.005962 2.9641 -192.71
## + POP80   1  0.004451 2.9656 -192.68
## + AREA    1  0.000834 2.9692 -192.60
## 
## Step:  AIC=-195.43
## CRIME ~ POVERT + TEMPER + BORN_F
## 
##          Df Sum of Sq    RSS     AIC
## + POPDEN  1  0.137392 2.7050 -196.66
## <none>                2.8423 -195.43
## + POP92   1  0.042609 2.7997 -194.42
## + UNEMP   1  0.035760 2.8066 -194.26
## + POP80   1  0.022474 2.8199 -193.95
## + AREA    1  0.007277 2.8351 -193.60
## + INCOME  1  0.004518 2.8378 -193.54
## 
## Step:  AIC=-196.65
## CRIME ~ POVERT + TEMPER + BORN_F + POPDEN
## 
##          Df Sum of Sq    RSS     AIC
## <none>                2.7050 -196.66
## + UNEMP   1  0.060878 2.6441 -196.13
## + AREA    1  0.020548 2.6844 -195.15
## + POP92   1  0.020537 2.6844 -195.15
## + POP80   1  0.004259 2.7007 -194.76
## + INCOME  1  0.000961 2.7040 -194.68
## 
## Call:
## lm(formula = CRIME ~ POVERT + TEMPER + BORN_F + POPDEN, data = df, 
##     na.action = na.exclude)
## 
## Coefficients:
## (Intercept)       POVERT       TEMPER       BORN_F       POPDEN  
##    9.093936     0.022670     0.003984     0.086179    -0.090989
stepAIC(model_, direction = "backward")
## Start:  AIC=-193.32
## CRIME ~ AREA + POP80 + POP92 + POPDEN + BORN_F + POVERT + INCOME + 
##     UNEMP + TEMPER
## 
##          Df Sum of Sq    RSS     AIC
## - POP92   1  0.005505 2.4469 -195.17
## - AREA    1  0.005589 2.4470 -195.17
## - POPDEN  1  0.005610 2.4470 -195.17
## - INCOME  1  0.005904 2.4473 -195.16
## - UNEMP   1  0.044635 2.4861 -194.14
## - POVERT  1  0.073682 2.5151 -193.38
## <none>                2.4414 -193.32
## - TEMPER  1  0.108623 2.5500 -192.49
## - POP80   1  0.163609 2.6050 -191.10
## - BORN_F  1  0.293714 2.7351 -187.93
## 
## Step:  AIC=-195.17
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + INCOME + UNEMP + 
##     TEMPER
## 
##          Df Sum of Sq    RSS     AIC
## - INCOME  1  0.006532 2.4535 -197.00
## - UNEMP   1  0.049840 2.4968 -195.86
## - POVERT  1  0.072543 2.5195 -195.27
## <none>                2.4469 -195.17
## - TEMPER  1  0.103532 2.5505 -194.48
## - POP80   1  0.178301 2.6252 -192.60
## - AREA    1  0.190764 2.6377 -192.29
## - POPDEN  1  0.252540 2.6995 -190.79
## - BORN_F  1  0.305201 2.7521 -189.53
## 
## Step:  AIC=-197
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + UNEMP + TEMPER
## 
##          Df Sum of Sq    RSS     AIC
## - UNEMP   1   0.05839 2.5118 -197.47
## <none>                2.4535 -197.00
## - TEMPER  1   0.09910 2.5526 -196.42
## - POVERT  1   0.11872 2.5722 -195.93
## - POP80   1   0.17260 2.6261 -194.58
## - AREA    1   0.18745 2.6409 -194.21
## - POPDEN  1   0.25177 2.7052 -192.65
## - BORN_F  1   0.33852 2.7920 -190.60
## 
## Step:  AIC=-197.47
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + TEMPER
## 
##          Df Sum of Sq    RSS     AIC
## <none>                2.5118 -197.47
## - TEMPER  1   0.09932 2.6112 -196.95
## - POP80   1   0.17256 2.6844 -195.15
## - AREA    1   0.18885 2.7007 -194.76
## - POPDEN  1   0.24590 2.7577 -193.40
## - POVERT  1   0.29614 2.8080 -192.22
## - BORN_F  1   0.44075 2.9526 -188.96
## 
## Call:
## lm(formula = CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + 
##     TEMPER, data = df, na.action = na.exclude)
## 
## Coefficients:
## (Intercept)         AREA        POP80       POPDEN       BORN_F       POVERT  
##    9.664651    -0.493029     0.447200    -0.613315     0.142206     0.016202  
##      TEMPER  
##    0.008129
stepAIC(model_manual, direction = "both", scope = list(lower = model.start, upper = model.end))
## Start:  AIC=-198
## CRIME ~ POPDEN + BORN_F + POVERT
## 
##          Df Sum of Sq    RSS     AIC
## <none>                2.7325 -198.00
## + UNEMP   1   0.06062 2.6719 -197.46
## + TEMPER  1   0.02754 2.7049 -196.66
## + AREA    1   0.01468 2.7178 -196.35
## + POP92   1   0.01468 2.7178 -196.35
## + POP80   1   0.00309 2.7294 -196.07
## + INCOME  1   0.00075 2.7317 -196.01
## - POPDEN  1   0.25265 2.9851 -194.25
## - BORN_F  1   0.31106 3.0435 -192.99
## - POVERT  1   0.83344 3.5659 -182.69
## 
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
## 
## Coefficients:
## (Intercept)       POPDEN       BORN_F       POVERT  
##     9.53670     -0.10966      0.09307      0.02318
modelAIC_ <- lm(CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + TEMPER, data = df, na.action = na.exclude)
modelAIC <- lm.beta(modelAIC_)

summary(modelAIC)
## 
## Call:
## lm(formula = CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + 
##     TEMPER, data = df, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41000 -0.11529  0.00209  0.11855  0.44871 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  9.664651           NA   0.782360  12.353  < 2e-16 ***
## AREA        -0.493029    -1.680253   0.236098  -2.088  0.04118 *  
## POP80        0.447200     1.265294   0.224031   1.996  0.05062 .  
## POPDEN      -0.613315    -1.798657   0.257385  -2.383  0.02048 *  
## BORN_F       0.142206     0.509047   0.044576   3.190  0.00229 ** 
## POVERT       0.016202     0.358145   0.006196   2.615  0.01135 *  
## TEMPER       0.008129     0.197515   0.005368   1.514  0.13536    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2081 on 58 degrees of freedom
## Multiple R-squared:  0.3288, Adjusted R-squared:  0.2594 
## F-statistic: 4.735 on 6 and 58 DF,  p-value: 0.000548
AIC(modelAIC)
## [1] -11.00726
plot_ellipse(model_manual, c(1, 2))

plot_ellipse(model_manual, c(1, 3))

plot_ellipse(model_manual, c(1, 4))

plot_ellipse(model_manual, c(2, 3))

plot_ellipse(model_manual, c(2, 4))

plot_ellipse(model_manual, c(3, 4))

plot_ellipse(modelAIC_, c(2, 3))

plot_ellipse(modelAIC_, c(2, 4))

plot_ellipse(modelAIC_, c(2, 7))

plot_ellipse(modelAIC_, c(3, 4))

plot_ellipse(modelAIC_, c(3, 7))

plot_ellipse(modelAIC_, c(4, 5))

# plot_ellipse(modelAIC_, c(1, 2))
# plot_ellipse(modelAIC_, c(1, 3))
# plot_ellipse(modelAIC_, c(1, 4))
# plot_ellipse(modelAIC_, c(1, 5))
# plot_ellipse(modelAIC_, c(1, 6))
# plot_ellipse(modelAIC_, c(1, 7))
# plot_ellipse(modelAIC_, c(2, 5))
# plot_ellipse(modelAIC_, c(2, 6))
# plot_ellipse(modelAIC_, c(3, 5))
# plot_ellipse(modelAIC_, c(3, 6))
# plot_ellipse(modelAIC_, c(4, 6))
# plot_ellipse(modelAIC_, c(4, 7))
# plot_ellipse(modelAIC_, c(5, 6))
# plot_ellipse(modelAIC_, c(5, 7))
# plot_ellipse(modelAIC_, c(6, 7))

10 Строите обычную регрессию по выбранному числу признаков. Сначала смотрите на нормальность остатков (зачем нужно на это смотреть?), затем смотрите на график Predicted vs Residuals. Как по нему понять, адекватна ли линейная регрессия? Как будет выглядеть график, если на самом деле была квадратичная зависимость (в случае одной независимой переменной)? Как может повлиять на этот график выбор Pairwise deletion для пропущенных наблюдений?

# hist(residuals(modelAIC_))
# shapiro.test(residuals(modelAIC_))

# ols_plot_resid_stud_fit(modelAIC_)

# # print(df_names[c(15, 27, 38, 56, 52), ])

# ols_plot_resid_stud(modelAIC_)
# # print(df_names[56, ])



hist(residuals(model_manual))

shapiro.test(residuals(model_manual))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_manual)
## W = 0.97945, p-value = 0.3524
ols_plot_resid_stud_fit(model_manual)

# print(df_names[c(15, 27, 38, 56, 52), ])

ols_plot_resid_stud(model_manual)

# print(df_names[56, ])

model_manual_ <- model_manual
model_manual <- lm.beta(model_manual)

11 Далее переходите к поиску outliers. Сначала смотрите на скаттерплот Residuls vs Deleted Residuals. Нужно объяснить, что этот такое, что откладывается по осям, как там найти outliers.

# ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres)) +
#     geom_point() +
#     geom_abline(slope = 1, intercept = 0, color = "red")

ggplot(data_frame(residuals = rstandard(model_manual), studres = studres(model_manual)), aes(x = residuals, y = studres)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, color = "red")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.

# ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres - residuals)) +
#     geom_point() +
#     geom_abline(slope = 0, intercept = 0, color = "red")

Новое

новые графики, в том числе, график с рычагами - остатками.

plot(model_manual)

12 Затем смотрите на разные меры, например, outliers по Куку и по Махаланобису (в пространстве регрессоров). Объясняете, что это такое, по отношению к чему это outliers. Умеете приводить пример, где, в случае одной независимой переменной, находится outliers по Куку, но не по Махаланобису, и наоборот.

plot(sort(mahalanobis_distance(df)$mahal.dist, decreasing = TRUE))

# plot(cooks.distance(modelAIC))

# df_names[which(cooks.distance(modelAIC) > 0.06), ]


plot(sort(cooks.distance(model_manual), decreasing = TRUE))

df_names[which(cooks.distance(model_manual) > 0.1), ]
## # A tibble: 1 × 12
##   CITY    STATE  AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
##   <chr>   <chr> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1 EL_PASO TX     5.50  13.0  13.2   7.70  9.17   3.15   25.3  23460  2.36   82.3

13 Итог: результат линейной регрессии, для которой проверена адекватность модели, значимость, отсутствие outliers, проинтерпретированы коэффициенты регрессии. Спрогнозируйте что-нибудь по построенной регрессионной модели, посмотрите на доверительные и предсказательные интервалы.

тут прогноз того что оказалось аутлаером

new <- data_logged %>%
    filter(CITY == "EL_PASO") %>%
    select(POPDEN, BORN_F, POVERT)

Новое

Попытка прогнозировать преступность в Петербуре

predict.lm(model_manual_, new)
##        1 
## 9.571922
exp(predict.lm(model_manual_, data.frame(POPDEN = log(13342), BORN_F = log(5.3), POVERT = 6.6), interval = "prediction"))
##        fit      lwr      upr
## 1 6656.181 4127.882 10733.04
exp(predict.lm(model_manual_, data.frame(POPDEN = log(13342), BORN_F = log(5.3), POVERT = 6.6), interval = "confidence"))
##        fit     lwr      upr
## 1 6656.181 5332.49 8308.453

преступность за 2022 год в СПб на 100000 чел.

print(62971 / (7500000 / 100000))
## [1] 839.6133

Плохо согласуется потому что модель построена на американских городах по данным прошлого века. Данные собирались по разному, законы разные…